1

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Read the insurance cost data set (path is relative to the working directory).
insurance <- read.csv(file = "insurance_cost.csv")

# plotly provides the interactive histograms used below.
# If it is not installed yet, run one of:
#   install.packages("plotly")
#   devtools::install_github("ropensci/plotly")
library(plotly)
## Warning: пакет 'plotly' был собран под R версии 4.2.2
## Загрузка требуемого пакета: ggplot2
## 
## Присоединяю пакет: 'plotly'
## Следующий объект скрыт от 'package:ggplot2':
## 
##     last_plot
## Следующий объект скрыт от 'package:stats':
## 
##     filter
## Следующий объект скрыт от 'package:graphics':
## 
##     layout
# Print a skimr overview of every column of the insurance data frame.
skimr::skim(insurance)
Data summary
Name insurance
Number of rows 1338
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
sex 0 1 4 6 0 2 0
smoker 0 1 2 3 0 2 0
region 0 1 9 9 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 39.21 14.05 18.00 27.00 39.00 51.00 64.00 ▇▅▅▆▆
bmi 0 1 30.66 6.10 15.96 26.30 30.40 34.69 53.13 ▂▇▇▂▁
children 0 1 1.09 1.21 0.00 0.00 1.00 2.00 5.00 ▇▂▂▁▁
charges 0 1 13270.42 12110.01 1121.87 4740.29 9382.03 16639.91 63770.43 ▇▂▁▁▁
# Show the structure of insurance: column names, types and first values.
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
# Print per-column summary statistics for insurance.
summary(insurance)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

2

You can also embed plots, for example:

# Histogram of age.
# `bins` is set explicitly (30 is the value ggplot2 falls back to when none is
# given), so the plot is unchanged but the `stat_bin()` message is not raised.
ggplot(data = insurance, aes(x = age)) +
  geom_histogram(bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Histogram of bmi (rows whose bmi is exactly 0 are dropped).
# `I()` makes plotly use "yellow" as a literal colour. A bare string passed to
# `color` is treated as data to be mapped through a palette, which is what
# produces the RColorBrewer warnings.
plot_ly(
  insurance[insurance$bmi != 0, ],
  x = ~bmi,
  type = "histogram",
  color = I("yellow")
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Histogram of the number of children.
# `I()` makes plotly use "pink" as a literal colour. A bare string passed to
# `color` is treated as data to be mapped through a palette, which is what
# produces the RColorBrewer warnings.
plot_ly(
  insurance,
  x = ~children,
  type = "histogram",
  color = I("pink")
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Histogram of charges (rows whose charges are exactly 0 are dropped).
# `I()` makes plotly use "brown" as a literal colour. A bare string passed to
# `color` is treated as data to be mapped through a palette, which is what
# produces the RColorBrewer warnings.
plot_ly(
  insurance[insurance$charges != 0, ],
  x = ~charges,
  type = "histogram",
  color = I("brown")
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

3

# Mean and median of charges, rounded to one decimal place for the labels.
charge_mean <- round(mean(insurance$charges), 1)
charge_median <- round(median(insurance$charges), 1)

# Density of charges with vertical reference lines at the mean (blue) and the
# median (red), each with a text label near the x-axis.
# The line colours are set outside aes(): inside aes() the strings "blue" and
# "red" are treated as data values and drawn with the default discrete palette
# instead of the named colours.
den <- ggplot(data = insurance, aes(x = charges)) +
  geom_density(color = "darkblue", fill = "lightblue") +
  xlab("Charges") +
  ylab("Density") +
  geom_vline(xintercept = charge_mean, color = "blue") +
  annotate(
    "text",
    x = charge_mean + 10000,
    y = 0.00001,
    label = paste0("Mean=", charge_mean)
  ) +
  geom_vline(xintercept = charge_median, color = "red") +
  annotate(
    "text",
    x = charge_median + 500,
    y = 0.00001,
    label = paste0("Median=", charge_median)
  ) +
  theme_bw() +
  theme(legend.position = "none")

4

# Box plots of charges split by one categorical variable each.

# charges by sex
box1 <- ggplot(data = insurance, mapping = aes(x = sex, y = charges)) +
  geom_boxplot() +
  theme_dark()

# charges by smoking status (drawn with whatever theme is currently active)
box2 <- ggplot(data = insurance, mapping = aes(x = smoker, y = charges)) +
  geom_boxplot() +
  theme_get()

# charges by region
box3 <- ggplot(data = insurance, mapping = aes(x = region, y = charges)) +
  geom_boxplot() +
  theme_grey()

5

library(ggpubr)

# Two-row layout: the density plot (panel "A") takes the first row, and the
# three box plots (panels "B", "C", "D") sit side by side in the second row.
combine_plot <- ggarrange(
  den,
  ggarrange(box1, box2, box3, ncol = 3, labels = c("B", "C", "D")),
  nrow = 2,
  labels = "A"
)

# Draw the combined figure with an overall title.
combine_plot +
  theme_void() +
  ggtitle("Characterisation of Charges value")

6

# Mean and median of charges over the whole data set, rounded to one decimal
# place for the labels (the same values are shown in every facet).
charge_mean <- round(mean(insurance$charges), 1)
charge_median <- round(median(insurance$charges), 1)

# Density of charges, one facet per region, with vertical reference lines at
# the overall mean (blue) and the overall median (red).
# The line colours are set outside aes(): inside aes() the strings "blue" and
# "red" are treated as data values and drawn with the default discrete palette
# instead of the named colours.
den <- ggplot(data = insurance, aes(x = charges, group = region)) +
  geom_density(color = "darkblue", fill = "lightblue") +
  xlab("Charges") +
  ylab("Density") +
  geom_vline(xintercept = charge_mean, color = "blue") +
  annotate(
    "text",
    x = charge_mean + 10000,
    y = 0.00001,
    label = paste0("Mean=", charge_mean)
  ) +
  geom_vline(xintercept = charge_median, color = "red") +
  annotate(
    "text",
    x = charge_median + 500,
    y = 0.00001,
    label = paste0("Median=", charge_median)
  ) +
  facet_grid(. ~ region) +
  theme_bw() +
  theme(legend.position = "none")
den

7

# Scatter plot of charges against age.
# theme_classic() is added before theme(): a complete theme replaces every
# earlier theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(x = age, y = charges)) +
  geom_point(size = 3) +
  xlab("age") +
  ylab("charges") +
  ggtitle("How charges changing with age") +
  theme_classic() +
  theme(axis.text.x = element_text(size = 14))

8 + 9

# Scatter plot of charges against age, coloured by smoking status, with one
# linear fit (and its confidence band) per smoker group.
# theme_bw() is added before theme(): a complete theme replaces every earlier
# theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(
    x = age, y = charges,
    color = smoker, fill = smoker, group = smoker
  )) +
  geom_point(size = 3) +
  geom_smooth(
    method = lm,
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("age") +
  ylab("charges") +
  ggtitle("How charges changing with age") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'

10

# Scatter plot of charges against bmi, coloured by smoking status, with one
# linear fit (and its confidence band) per smoker group.
# theme_bw() is added before theme(): a complete theme replaces every earlier
# theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(
    x = bmi, y = charges,
    color = smoker, fill = smoker, group = smoker
  )) +
  geom_point(size = 2) +
  geom_smooth(
    method = lm,
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("bmi") +
  ylab("charges") +
  ggtitle("How charges changing with body mass index") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'

11

А влияет ли курение на индекс массы тела в наших данных?

# Box plot of bmi for smokers and non-smokers.
# theme_bw() is added before theme(): a complete theme replaces every earlier
# theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(x = smoker, y = bmi)) +
  geom_boxplot() +
  ggtitle("Is there a relationship between bmi and smoking?") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))

Как видно из графика, заметной разницы по индексу массы тела между группами курящих и некурящих не наблюдается (формальный статистический тест при этом не проводился). Посмотрим, есть ли разница у разных полов.

# Box plot of bmi for smokers and non-smokers, split by sex via the fill colour.
# theme_bw() is added before theme(): a complete theme replaces every earlier
# theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(x = smoker, y = bmi, fill = sex)) +
  geom_boxplot() +
  ggtitle("Is there a relationship between bmi and smoking related to sex?") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))

Значимых различий также не обнаружено.

12

В 10-м задании мы увидели сильную зависимость charges от курения и bmi: у курящих наблюдается сильная корреляция между bmi и затратами страховой. Хотелось бы узнать, какие показатели здоровья меняются с возрастом в группах курящих и некурящих.

# Scatter plot of bmi against age, coloured by smoking status, with one linear
# fit (and its confidence band) per smoker group.
# theme_bw() is added before theme(): a complete theme replaces every earlier
# theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(
    x = age, y = bmi,
    color = smoker, fill = smoker, group = smoker
  )) +
  geom_point(size = 2) +
  geom_smooth(
    method = lm,
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("age") +
  ylab("bmi") +
  ggtitle("How bmi changing with age and smoking") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'

Никаких различий не обнаружено в группах курящих и некурящих. Попробуем поискать в регионах.

# bmi against age, grouped and coloured by region, with one linear fit (and
# its confidence band) per region.
# The mapping is x = age, y = bmi so that it agrees with the axis labels and
# the title; the original had the two variables swapped in aes() only.
# theme_bw() is added before theme(): a complete theme replaces every earlier
# theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(
    x = age, y = bmi,
    color = region, fill = region, group = region
  )) +
  geom_boxplot() +
  geom_smooth(
    method = lm,
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("age") +
  ylab("bmi") +
  ggtitle("How bmi changing with age and region") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'

Снова не видим различий в регионах. Может быть, они есть в затратах на человека в зависимости от индекса массы тела и региона?

# charges against bmi, grouped and coloured by region, with one linear fit
# (and its confidence band) per region.
# The title now names the variables actually plotted (charges vs bmi); the
# original title was copied from the age/bmi plot.
# theme_bw() is added before theme(): a complete theme replaces every earlier
# theme() setting, so the x-axis text size must be set after it.
insurance %>%
  ggplot(aes(
    x = bmi, y = charges,
    color = region, fill = region, group = region
  )) +
  geom_boxplot() +
  geom_smooth(
    method = lm,
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("bmi") +
  ylab("charges") +
  ggtitle("How charges changing with bmi and region") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'

Кажется, что да: продвигаясь с северо-запада на юго-восток, видим постепенное увеличение затрат с ростом индекса массы тела. Значит, регион проживания влияет на то, как затраты растут с индексом массы тела, хотя сам индекс массы тела между регионами заметно не различается.